1 Data Preparation and Setup

data_2020 <- read.csv("2020.csv")
data_2025 <- readRDS("crew_dataset_keys_analysis_ready.rds")

age_order <- c("Under 25", "25-29", "30-34", "35-39", "40-44", "45-49",
               "50-54", "55-59", "60-64", "65 or older")

standardize_gender <- function(gender_col) {
  gender_col <- tolower(trimws(gender_col))
  case_when(
    gender_col %in% c("female", "woman") ~ "Women",
    gender_col %in% c("male", "man") ~ "Men",
    grepl("nonbinary|non-binary|gender variant", gender_col) ~ "Nonbinary",
    TRUE ~ as.character(gender_col)
  )
}

df_2020 <- data_2020 %>%
  mutate(
    gender = standardize_gender(m27),
    age = factor(m25, levels = age_order)
  ) %>%
  filter(!is.na(gender) & !is.na(age) & gender != "" & age != "Decline to Answer")

clean_numeric_column <- function(col) {
  col_char <- as.character(col)
  col_clean <- gsub(",", "", col_char)
  col_clean <- gsub("\\s+", "", col_clean)
  col_numeric <- suppressWarnings(as.numeric(col_clean))
  col_numeric[col_numeric > 10000000] <- NA
  col_numeric[col_numeric < 0] <- 0
  return(col_numeric)
}

df_2025 <- data_2025 %>%
  mutate(
    gender = standardize_gender(m27),
    age = factor(m25, levels = age_order),
    base_salary = clean_numeric_column(`m21A[1_SQ001]`),
    bonus = clean_numeric_column(`m21A[2_SQ001]`),
    commission = clean_numeric_column(`m21A[3_SQ001]`),
    profit_sharing = clean_numeric_column(`m21A[4_SQ001]`),
    long_term_incentive = clean_numeric_column(`m21A[5_SQ001]`),
    total_comp_reported = clean_numeric_column(`m28B`),
    specialization_group = case_when(
      m1B %in% c("Asset/Property Management", "Corporate Real Estate", "Portfolio Management") ~ "Asset Management",
      m1B %in% c("Brokerage/Sales/Leasing") ~ "Brokerage", 
      m1B %in% c("Acquisitions/ Dispositions", "Architecture and Design", "Construction", 
                  "Development", "Economic Development", "Engineering", "Environmental",
                  "Interior Design", "Investments") ~ "Development",
      m1B %in% c("Accounting", "Appraisal/Valuation", "Consulting", "Executive Management",
                  "Finance/Lending/Mortgage", "Human Resources", "Law", 
                  "Marketing/Business Development", "Sustainability", "Title/Escrow") ~ "Finance",
      TRUE ~ "Other"
    ),
    ethnicity = case_when(
      `G20Q70` == "White (Non-Hispanic)" ~ "White",
      `G20Q70` == "African-American/Black" ~ "Black",
      `G20Q70` == "Asian" ~ "Asian",
      `G20Q70` == "Hispanic/Latinx (Any Race)" ~ "Hispanic/Latinx",
      `G20Q70` == "Other/mixed" ~ "Other/Mixed",
      `G20Q70` %in% c("Native American or Alaskan", "Middle Eastern or North African") ~ "Other/Mixed",
      `G20Q70` == "Prefer not to answer" ~ NA_character_,
      TRUE ~ NA_character_
    ),
    total_compensation = case_when(
      !is.na(total_comp_reported) & total_comp_reported > 0 ~ total_comp_reported,
      TRUE ~ (replace_na(base_salary, 0) + replace_na(bonus, 0) + 
              replace_na(commission, 0) + replace_na(profit_sharing, 0) + 
              replace_na(long_term_incentive, 0))
    )
  ) %>%
  filter(!is.na(gender) & !is.na(age) & gender != "" & age != "Decline to Answer")

gender_colors <- c("Women" = "#e74c3c", "Men" = "#3498db", "Nonbinary" = "#9b59b6")
ethnicity_colors <- c("White" = "#3498db", "Black" = "#e74c3c", "Asian" = "#f39c12", 
                     "Hispanic/Latinx" = "#27ae60", "Other/Mixed" = "#9b59b6")

1.1 Age Distribution Weighting Framework

women_2020_age_counts <- df_2020 %>%
  filter(gender == "Women") %>%
  count(age, name = "count_2020") %>%
  mutate(prop_2020 = count_2020 / sum(count_2020))

kable(women_2020_age_counts, 
      caption = "2020 Women Age Distribution (Reference for Weighting)",
      col.names = c("Age Group", "Count 2020", "Proportion 2020"),
      digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#3498db")
2020 Women Age Distribution (Reference for Weighting)
Age Group Count 2020 Proportion 2020
Under 25 38 0.018
25-29 164 0.077
30-34 271 0.127
35-39 325 0.153
40-44 292 0.137
45-49 263 0.123
50-54 266 0.125
55-59 268 0.126
60-64 172 0.081
65 or older 72 0.034
women_age_weights <- women_2020_age_counts %>%
  select(age, weight = prop_2020)
theme_compensation <- theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5, margin = margin(b = 20)),
    plot.subtitle = element_text(size = 12, hjust = 0.5, color = "gray40", margin = margin(b = 15)),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 10),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10),
    legend.position = "bottom",
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_line(color = "gray90", size = 0.3),
    panel.grid.major.y = element_line(color = "gray90", size = 0.3),
    strip.text = element_text(size = 11, face = "bold"),
    plot.background = element_rect(fill = "white", color = NA),
    panel.background = element_rect(fill = "white", color = NA)
  )

format_currency <- function(x) {
  scales::dollar_format(accuracy = 1)(x)
}

apply_women_weighting <- function(data, question_col, show_breakdown = FALSE) {
  plot_data <- data %>%
    select(gender, age, response = all_of(question_col)) %>%
    filter(!is.na(response) & response != "")
  
  total_responses <- nrow(plot_data)
  
  men_data <- plot_data %>% filter(gender == "Men")
  women_data <- plot_data %>% filter(gender == "Women")
  
  men_summary <- men_data %>%
    count(response) %>%
    mutate(
      percent = n / sum(n) * 100,
      global_percent = n / total_responses * 100,
      gender = "Men",
      weighted_percent = percent
    )
  
  women_by_age <- women_data %>%
    count(age, response) %>%
    group_by(age) %>%
    mutate(percent_in_age = n / sum(n) * 100) %>%
    ungroup()
  
  women_weighted <- women_by_age %>%
    left_join(women_age_weights, by = "age") %>%
    mutate(weighted_contribution = percent_in_age * weight) %>%
    group_by(response) %>%
    summarise(
      weighted_percent = sum(weighted_contribution, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    mutate(gender = "Women")
  
  final_results <- bind_rows(
    men_summary %>% select(response, gender, weighted_percent),
    women_weighted %>% select(response, gender, weighted_percent)
  )
  
  return(final_results)
}

apply_compensation_weighting <- function(data, salary_col, group_col = NULL) {
  if (is.character(data[[salary_col]])) {
    data[[salary_col]] <- as.numeric(data[[salary_col]])
  }
  
  if (is.null(group_col)) {
    plot_data <- data %>%
      select(gender, age, salary = all_of(salary_col)) %>%
      filter(!is.na(salary) & salary > 0 & !is.infinite(salary))
  } else {
    plot_data <- data %>%
      select(gender, age, salary = all_of(salary_col), group = all_of(group_col)) %>%
      filter(!is.na(salary) & salary > 0 & !is.infinite(salary) & !is.na(group) & group != "")
  }
  
  men_data <- plot_data %>% filter(gender == "Men")
  women_data <- plot_data %>% filter(gender == "Women")
  
  if (is.null(group_col)) {
    men_stats <- men_data %>%
      summarise(
        mean_salary = mean(salary, na.rm = TRUE),
        median_salary = median(salary, na.rm = TRUE),
        gender = "Men"
      )
    
    women_by_age <- women_data %>%
      group_by(age) %>%
      summarise(
        mean_salary = mean(salary, na.rm = TRUE),
        median_salary = median(salary, na.rm = TRUE),
        .groups = "drop"
      ) %>%
      left_join(women_age_weights, by = "age") %>%
      filter(!is.na(weight))
    
    women_weighted <- women_by_age %>%
      summarise(
        mean_salary = sum(mean_salary * weight, na.rm = TRUE),
        median_salary = sum(median_salary * weight, na.rm = TRUE),
        gender = "Women",
        .groups = "drop"
      )
    
    return(bind_rows(men_stats, women_weighted))
    
  } else {
    men_stats <- men_data %>%
      group_by(group) %>%
      summarise(
        mean_salary = mean(salary, na.rm = TRUE),
        median_salary = median(salary, na.rm = TRUE),
        gender = "Men",
        .groups = "drop"
      )
    
    women_by_age_group <- women_data %>%
      group_by(age, group) %>%
      summarise(
        mean_salary = mean(salary, na.rm = TRUE),
        median_salary = median(salary, na.rm = TRUE),
        .groups = "drop"
      ) %>%
      left_join(women_age_weights, by = "age") %>%
      filter(!is.na(weight))
    
    women_weighted <- women_by_age_group %>%
      group_by(group) %>%
      summarise(
        mean_salary = sum(mean_salary * weight, na.rm = TRUE),
        median_salary = sum(median_salary * weight, na.rm = TRUE),
        gender = "Women",
        .groups = "drop"
      )
    
    return(bind_rows(men_stats, women_weighted))
  }
}

calculate_pay_gap <- function(data) {
  women_salary <- data %>% filter(gender == "Women") %>% pull(mean_salary)
  men_salary <- data %>% filter(gender == "Men") %>% pull(mean_salary)
  if (length(women_salary) > 0 && length(men_salary) > 0) {
    round((1 - women_salary/men_salary) * 100, 1)
  } else {
    NA
  }
}

2 Comprehensive Compensation Analysis

2.1 Base Salary Analysis by Specialization

2.1.1 Mean Base Salary by Specialization

salary_spec_data <- apply_compensation_weighting(df_2025, "base_salary", "specialization_group")

salary_mean_table <- salary_spec_data %>%
  select(specialization = group, gender, mean_salary) %>%
  pivot_wider(names_from = gender, values_from = mean_salary, names_prefix = "") %>%
  mutate(Gap = round((1 - Women/Men) * 100, 1)) %>%
  arrange(desc(Men))

# Add overall mean row
overall_mean_women <- salary_spec_data %>% filter(gender == "Women") %>% summarise(avg = mean(mean_salary)) %>% pull(avg)
overall_mean_men <- salary_spec_data %>% filter(gender == "Men") %>% summarise(avg = mean(mean_salary)) %>% pull(avg)

mean_row <- data.frame(
  specialization = "OVERALL MEAN",
  Women = overall_mean_women,
  Men = overall_mean_men
) %>%
  mutate(Gap = round((1 - Women/Men) * 100, 1))

final_mean_salary_table <- bind_rows(salary_mean_table, mean_row)

kable(final_mean_salary_table,
      caption = "Mean Annual Base Salary by Specialization (Excluding Commissions and Bonuses)",
      col.names = c("Specialization", "Women", "Men", "Gap %"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(nrow(final_mean_salary_table), bold = TRUE, background = "#e74c3c", color = "white")
Mean Annual Base Salary by Specialization (Excluding Commissions and Bonuses)
Specialization Women Men Gap %
Development 210,767.64 118,091.68 44.0
Finance 132,951.47 134,633.58 -1.3
Other 108,731.11 97,331.19 10.5
Asset Management 101,926.70 134,754.26 -32.2
Brokerage 70,653.76 92,044.34 -30.3
OVERALL MEAN 125,006.14 115,371.01 7.7
# Plot for mean salaries
ggplot(salary_spec_data, aes(x = reorder(group, mean_salary), y = mean_salary, fill = gender)) +
  geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
  geom_text(aes(label = format_currency(mean_salary)), 
            position = position_dodge(width = 0.7), hjust = -0.1, size = 3.5) +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(labels = format_currency, expand = expansion(mult = c(0, 0.15))) +
  coord_flip() +
  labs(
    title = "Mean Base Salary by Specialization and Gender",
    subtitle = "Women's salaries weighted by 2020 age distribution",
    x = "Specialization",
    y = "Mean Annual Base Salary",
    fill = "Gender",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation

2.1.2 Median Base Salary by Specialization

# Calculate median salary by specialization using proper weighting approach
median_salary_spec <- df_2025 %>%
  filter(!is.na(base_salary) & base_salary > 0 & !is.na(specialization_group)) %>%
  group_by(specialization_group, gender) %>%
  summarise(median_salary = median(base_salary, na.rm = TRUE), .groups = "drop")

salary_median_table <- median_salary_spec %>%
  select(specialization = specialization_group, gender, median_salary) %>%
  pivot_wider(names_from = gender, values_from = median_salary, names_prefix = "") %>%
  arrange(desc(coalesce(Men, Women, 0)))

# Ensure we have columns for Women and Men even if missing
if(!"Women" %in% names(salary_median_table)) salary_median_table$Women <- NA
if(!"Men" %in% names(salary_median_table)) salary_median_table$Men <- NA

# Calculate gap and reorder columns
salary_median_table <- salary_median_table %>%
  mutate(Gap = round((1 - Women/Men) * 100, 1)) %>%
  select(specialization, Women, Men, Gap)

# Add overall median row
overall_median_data <- df_2025 %>%
  filter(!is.na(base_salary) & base_salary > 0) %>%
  group_by(gender) %>%
  summarise(median_salary = median(base_salary, na.rm = TRUE), .groups = "drop")

women_median <- ifelse(nrow(overall_median_data %>% filter(gender == "Women")) > 0,
                      overall_median_data %>% filter(gender == "Women") %>% pull(median_salary),
                      NA)
men_median <- ifelse(nrow(overall_median_data %>% filter(gender == "Men")) > 0,
                    overall_median_data %>% filter(gender == "Men") %>% pull(median_salary),
                    NA)

median_row <- data.frame(
  specialization = "OVERALL MEDIAN",
  Women = women_median,
  Men = men_median,
  Gap = round((1 - women_median/men_median) * 100, 1)
)

final_median_salary_table <- bind_rows(salary_median_table, median_row)

kable(final_median_salary_table,
      caption = "Median Annual Base Salary by Specialization (Excluding Commissions and Bonuses)",
      col.names = c("Specialization", "Women", "Men", "Gap %"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(nrow(final_median_salary_table), bold = TRUE, background = "#27ae60", color = "white")
Median Annual Base Salary by Specialization (Excluding Commissions and Bonuses)
Specialization Women Men Gap %
Development 112,250 125,000 10.2
Finance 120,000 116,925 -2.6
Other 100,000 97,850 -2.2
Asset Management 120,000 87,250 -37.5
Brokerage 75,000 60,000 -25.0
OVERALL MEDIAN 115,000 113,500 -1.3
# Plot for median salaries
ggplot(median_salary_spec, aes(x = reorder(specialization_group, median_salary), y = median_salary, fill = gender)) +
  geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
  geom_text(aes(label = format_currency(median_salary)), 
            position = position_dodge(width = 0.7), hjust = -0.1, size = 3.5) +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(labels = format_currency, expand = expansion(mult = c(0, 0.15))) +
  coord_flip() +
  labs(
    title = "Median Base Salary by Specialization and Gender",
    subtitle = "50th percentile compensation levels",
    x = "Specialization",
    y = "Median Annual Base Salary",
    fill = "Gender",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation

2.2 Commission and Bonus Analysis by Specialization

2.2.1 Mean Commission and Bonus by Specialization

df_2025_combined <- df_2025 %>%
  mutate(commission_bonus = (replace_na(commission, 0) + replace_na(bonus, 0)))

combined_spec_data <- apply_compensation_weighting(df_2025_combined, "commission_bonus", "specialization_group")

commission_mean_table <- combined_spec_data %>%
  select(specialization = group, gender, commission_bonus = mean_salary) %>%
  pivot_wider(names_from = gender, values_from = commission_bonus, names_prefix = "") %>%
  mutate(Gap = round((1 - Women/Men) * 100, 1)) %>%
  arrange(desc(Men))

# Add overall mean row
overall_mean_women_cb <- combined_spec_data %>% filter(gender == "Women") %>% summarise(avg = mean(mean_salary)) %>% pull(avg)
overall_mean_men_cb <- combined_spec_data %>% filter(gender == "Men") %>% summarise(avg = mean(mean_salary)) %>% pull(avg)

mean_cb_row <- data.frame(
  specialization = "OVERALL MEAN",
  Women = overall_mean_women_cb,
  Men = overall_mean_men_cb
) %>%
  mutate(Gap = round((1 - Women/Men) * 100, 1))

final_mean_commission_table <- bind_rows(commission_mean_table, mean_cb_row)

kable(final_mean_commission_table,
      caption = "Mean Annual Commission and Bonus Earnings by Specialization",
      col.names = c("Specialization", "Women", "Men", "Gap %"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(nrow(final_mean_commission_table), bold = TRUE, background = "#e74c3c", color = "white")
Mean Annual Commission and Bonus Earnings by Specialization
Specialization Women Men Gap %
Brokerage 258,557.69 126,184.82 51.2
Development 156,694.89 37,157.93 76.3
Finance 45,353.13 57,482.75 -26.7
Asset Management 45,011.38 35,084.87 22.1
Other 44,905.67 28,842.93 35.8
OVERALL MEAN 110,104.55 56,950.66 48.3
# Plot for mean commission and bonus
ggplot(combined_spec_data, aes(x = reorder(group, mean_salary), y = mean_salary, fill = gender)) +
  geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
  geom_text(aes(label = format_currency(mean_salary)), 
            position = position_dodge(width = 0.7), hjust = -0.1, size = 3.5) +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(labels = format_currency, expand = expansion(mult = c(0, 0.15))) +
  coord_flip() +
  labs(
    title = "Mean Commission and Bonus Earnings by Specialization and Gender",
    subtitle = "Women's earnings weighted by 2020 age distribution",
    x = "Specialization",
    y = "Mean Annual Commission + Bonus",
    fill = "Gender",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation

2.2.2 Median Commission and Bonus by Specialization

# Calculate median commission and bonus by specialization
median_cb_spec <- df_2025_combined %>%
  filter(!is.na(commission_bonus) & commission_bonus > 0 & !is.na(specialization_group)) %>%
  group_by(specialization_group, gender) %>%
  summarise(median_cb = median(commission_bonus, na.rm = TRUE), .groups = "drop")

commission_median_table <- median_cb_spec %>%
  select(specialization = specialization_group, gender, median_cb) %>%
  pivot_wider(names_from = gender, values_from = median_cb, names_prefix = "") %>%
  arrange(desc(coalesce(Men, Women, 0)))

# Ensure we have columns for Women and Men even if missing
if(!"Women" %in% names(commission_median_table)) commission_median_table$Women <- NA
if(!"Men" %in% names(commission_median_table)) commission_median_table$Men <- NA

# Calculate gap and reorder columns
commission_median_table <- commission_median_table %>%
  mutate(Gap = round((1 - Women/Men) * 100, 1)) %>%
  select(specialization, Women, Men, Gap)

# Add overall median row
overall_median_cb_data <- df_2025_combined %>%
  filter(!is.na(commission_bonus) & commission_bonus > 0) %>%
  group_by(gender) %>%
  summarise(median_cb = median(commission_bonus, na.rm = TRUE), .groups = "drop")

women_median_cb <- ifelse(nrow(overall_median_cb_data %>% filter(gender == "Women")) > 0,
                         overall_median_cb_data %>% filter(gender == "Women") %>% pull(median_cb),
                         NA)
men_median_cb <- ifelse(nrow(overall_median_cb_data %>% filter(gender == "Men")) > 0,
                       overall_median_cb_data %>% filter(gender == "Men") %>% pull(median_cb),
                       NA)

median_cb_row <- data.frame(
  specialization = "OVERALL MEDIAN",
  Women = women_median_cb,
  Men = men_median_cb,
  Gap = round((1 - women_median_cb/men_median_cb) * 100, 1)
)

final_median_commission_table <- bind_rows(commission_median_table, median_cb_row)

kable(final_median_commission_table,
      caption = "Median Annual Commission and Bonus Earnings by Specialization",
      col.names = c("Specialization", "Women", "Men", "Gap %"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(nrow(final_median_commission_table), bold = TRUE, background = "#27ae60", color = "white")
Median Annual Commission and Bonus Earnings by Specialization
Specialization Women Men Gap %
Brokerage 50,000.0 110,000 54.5
Finance 17,250.5 22,000 21.6
Other 10,000.0 20,000 50.0
Development 10,000.0 18,000 44.4
Asset Management 20,000.0 16,000 -25.0
OVERALL MEDIAN 15,000.0 20,000 25.0
# Plot for median commission and bonus
ggplot(median_cb_spec, aes(x = reorder(specialization_group, median_cb), y = median_cb, fill = gender)) +
  geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
  geom_text(aes(label = format_currency(median_cb)), 
            position = position_dodge(width = 0.7), hjust = -0.1, size = 3.5) +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(labels = format_currency, expand = expansion(mult = c(0, 0.15))) +
  coord_flip() +
  labs(
    title = "Median Commission and Bonus Earnings by Specialization and Gender",
    subtitle = "50th percentile performance-based compensation",
    x = "Specialization",
    y = "Median Annual Commission + Bonus",
    fill = "Gender",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation

2.3 Total Compensation Overview

total_comp_data <- apply_compensation_weighting(df_2025, "total_compensation")
pay_gap <- calculate_pay_gap(total_comp_data)

ggplot(total_comp_data, aes(x = gender, y = mean_salary, fill = gender)) +
  geom_col(width = 0.6, alpha = 0.8) +
  geom_text(aes(label = format_currency(mean_salary)), 
            vjust = -0.5, size = 5, fontface = "bold") +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(labels = format_currency, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Total Annual Compensation by Gender",
    subtitle = paste0("Women's compensation weighted by 2020 age distribution | Gender pay gap: ", pay_gap, "%"),
    x = "Gender", 
    y = "Mean Total Compensation",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation +
  guides(fill = "none")

2.4 Total Annual Compensation Distribution

compensation_ranges <- df_2025 %>%
  filter(!is.na(total_compensation) & total_compensation > 0 & !is.na(gender)) %>%
  mutate(
    salary_range = case_when(
      total_compensation < 50000 ~ "Less than $50,000",
      total_compensation >= 50000 & total_compensation <= 74999 ~ "$50,000 — $74,999",
      total_compensation >= 75000 & total_compensation <= 99999 ~ "$75,000 — $99,999",
      total_compensation >= 100000 & total_compensation <= 149999 ~ "$100,000 — $149,999",
      total_compensation >= 150000 & total_compensation <= 199999 ~ "$150,000 — $199,999",
      total_compensation >= 200000 & total_compensation <= 249999 ~ "$200,000 — $249,999",
      total_compensation >= 250000 & total_compensation <= 349999 ~ "$250,000 — $349,999",
      total_compensation >= 350000 & total_compensation <= 500000 ~ "$350,000 — $500,000",
      total_compensation > 500000 ~ "More than $500,000",
      TRUE ~ "Other"
    ),
    salary_range = factor(salary_range, levels = c(
      "Less than $50,000", "$50,000 — $74,999", "$75,000 — $99,999", 
      "$100,000 — $149,999", "$150,000 — $199,999", "$200,000 — $249,999",
      "$250,000 — $349,999", "$350,000 — $500,000", "More than $500,000"
    ))
  )

# Calculate percentages by gender and overall
comp_distribution <- compensation_ranges %>%
  group_by(salary_range, gender) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(gender) %>%
  mutate(
    total_by_gender = sum(count),
    pct_by_gender = round((count / total_by_gender) * 100, 1)
  ) %>%
  ungroup() %>%
  select(salary_range, gender, count, pct_by_gender)

# Calculate overall percentages
overall_distribution <- compensation_ranges %>%
  group_by(salary_range) %>%
  summarise(total_count = n(), .groups = "drop") %>%
  mutate(
    total_respondents = sum(total_count),
    pct_total = round((total_count / total_respondents) * 100, 1)
  )

# Reshape for table format
comp_table <- comp_distribution %>%
  select(salary_range, gender, pct_by_gender) %>%
  pivot_wider(names_from = gender, values_from = pct_by_gender, values_fill = 0) %>%
  left_join(overall_distribution %>% select(salary_range, pct_total), by = "salary_range") %>%
  arrange(salary_range)

# Ensure we have columns for Women and Men even if missing
if(!"Women" %in% names(comp_table)) comp_table$Women <- 0
if(!"Men" %in% names(comp_table)) comp_table$Men <- 0

# Reorder columns
comp_table <- comp_table %>%
  select(salary_range, Women, Men, pct_total) %>%
  arrange(match(salary_range, levels(compensation_ranges$salary_range)))

kable(comp_table,
      caption = "Total Annual Compensation Distribution (All Respondents)",
      col.names = c("Compensation Range", "Women (%)", "Men (%)", "All Respondents (%)"),
      digits = 1) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  add_header_above(c(" " = 1, "Percentage in Each Range" = 3)) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#3498db")
Total Annual Compensation Distribution (All Respondents)
Percentage in Each Range
Compensation Range Women (%) Men (%) All Respondents (%)
Less than $50,000 11.7 13.9 12.1
$50,000 — $74,999 5.6 4.8 5.5
$75,000 — $99,999 11.4 9.4 11.1
$100,000 — $149,999 24.7 17.8 23.8
$150,000 — $199,999 16.7 16.9 16.7
$200,000 — $249,999 9.9 13.6 10.4
$250,000 — $349,999 10.1 10.0 10.1
$350,000 — $500,000 5.2 6.6 5.4
More than $500,000 4.6 6.9 4.9

2.5 Compensation Components Breakdown

comp_components <- list(
  "Base Salary" = apply_compensation_weighting(df_2025, "base_salary"),
  "Commission" = apply_compensation_weighting(df_2025, "commission"),
  "Bonus" = apply_compensation_weighting(df_2025, "bonus"),
  "Profit Sharing" = apply_compensation_weighting(df_2025, "profit_sharing"),
  "Long-Term Incentive" = apply_compensation_weighting(df_2025, "long_term_incentive")
)

all_comp_data <- map_dfr(comp_components, identity, .id = "component") %>%
  mutate(
    component = factor(component, levels = c("Base Salary", "Commission", "Bonus", 
                                           "Profit Sharing", "Long-Term Incentive"))
  )

total_comp_by_gender <- apply_compensation_weighting(df_2025, "total_compensation")
women_total <- total_comp_by_gender %>% filter(gender == "Women") %>% pull(mean_salary)
men_total <- total_comp_by_gender %>% filter(gender == "Men") %>% pull(mean_salary)

all_comp_data <- all_comp_data %>%
  mutate(
    total_comp = ifelse(gender == "Women", women_total, men_total),
    percentage = round((mean_salary / total_comp) * 100, 1),
    label = paste0(format_currency(mean_salary), "\n(", percentage, "%)")
  )

ggplot(all_comp_data, aes(x = component, y = mean_salary, fill = gender)) +
  geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
  geom_text(aes(label = label), 
            position = position_dodge(width = 0.7), vjust = -0.3, size = 3, fontface = "bold") +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(labels = format_currency, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Compensation Components Analysis by Gender",
    subtitle = "Dollar amounts and percentages of total compensation shown",
    x = "Compensation Component",
    y = "Mean Annual Amount",
    fill = "Gender",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

2.6 Compensation by Career Level

position_comp_data <- apply_compensation_weighting(df_2025, "total_compensation", "m3A")

position_comp_filtered <- position_comp_data %>%
  filter(!group %in% c("Unemployed", "Retired")) %>%
  mutate(
    group = case_when(
      str_detect(group, "C-Suite") ~ "C-Suite",
      str_detect(group, "SVP|Vice President|Managing Director|Partner") ~ "VP/SVP/MD/Partner",
      str_detect(group, "Senior level") ~ "Senior Level",
      str_detect(group, "Mid-level|Associate") ~ "Mid-Level/Associate", 
      str_detect(group, "Entry level") ~ "Entry Level",
      str_detect(group, "Self-employed|Independent") ~ "Self-Employed",
      TRUE ~ group
    )
  ) %>%
  filter(!is.na(group))

position_order <- c("Entry Level", "Mid-Level/Associate", "Senior Level", 
                   "VP/SVP/MD/Partner", "C-Suite", "Self-Employed")

position_comp_filtered <- position_comp_filtered %>%
  mutate(group = factor(group, levels = position_order))

ggplot(position_comp_filtered, aes(x = group, y = mean_salary, fill = gender)) +
  geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
  geom_text(aes(label = format_currency(mean_salary)), 
            position = position_dodge(width = 0.7), vjust = -0.3, size = 3.5) +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(labels = format_currency, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Total Compensation by Position Level and Gender",
    subtitle = "Clear hierarchy progression shown",
    x = "Position Level",
    y = "Mean Total Compensation", 
    fill = "Gender",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9))

2.7 Compensation by Age Groups

age_comp_data <- apply_compensation_weighting(df_2025, "total_compensation", "m25")

age_comp_filtered <- age_comp_data %>%
  filter(!is.na(group)) %>%
  mutate(group = factor(group, levels = age_order))

ggplot(age_comp_filtered, aes(x = group, y = mean_salary, fill = gender)) +
  geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
  geom_text(aes(label = format_currency(mean_salary)), 
            position = position_dodge(width = 0.7), vjust = -0.3, size = 3.5) +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(labels = format_currency, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Total Compensation by Age Group and Gender",
    subtitle = "Career progression patterns visible across age groups",
    x = "Age Group",
    y = "Mean Total Compensation",
    fill = "Gender",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

3 Diversity and Inclusion Analysis

3.1 Compensation by Race and Ethnicity

women_race_salary <- df_2025 %>%
  filter(gender == "Women" & !is.na(ethnicity) & !is.na(base_salary) & base_salary > 0) %>%
  group_by(age, ethnicity) %>%
  summarise(mean_salary = mean(base_salary, na.rm = TRUE), .groups = "drop") %>%
  left_join(women_age_weights, by = "age") %>%
  filter(!is.na(weight)) %>%
  group_by(ethnicity) %>%
  summarise(
    weighted_mean = sum(mean_salary * weight, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(gender = "Women")

men_race_salary <- df_2025 %>%
  filter(gender == "Men" & !is.na(ethnicity) & !is.na(base_salary) & base_salary > 0) %>%
  group_by(ethnicity) %>%
  summarise(weighted_mean = mean(base_salary, na.rm = TRUE), .groups = "drop") %>%
  mutate(gender = "Men")

all_race_salary <- bind_rows(women_race_salary, men_race_salary) %>%
  pivot_wider(names_from = gender, values_from = weighted_mean, values_fill = NA) %>%
  arrange(desc(Men))

overall_women <- women_race_salary %>% summarise(avg = mean(weighted_mean)) %>% pull(avg)
overall_men <- men_race_salary %>% summarise(avg = mean(weighted_mean)) %>% pull(avg)

all_row <- data.frame(
  ethnicity = "ALL",
  Women = overall_women,
  Men = overall_men
)

final_race_salary_table <- bind_rows(all_race_salary, all_row)

kable(final_race_salary_table,
      caption = "Mean Base Salaries by Race and Gender",
      col.names = c("Race/Ethnicity", "Women", "Men"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(nrow(final_race_salary_table), bold = TRUE, background = "#e74c3c", color = "white")
Mean Base Salaries by Race and Gender
Race/Ethnicity Women Men
White 122,665.2 142,282.65
Other/Mixed 118,727.2 133,919.00
Black 101,784.4 117,911.00
Hispanic/Latinx 131,802.8 116,265.17
Asian 129,156.8 98,435.73
ALL 120,827.3 121,762.71

3.2 Commission and Bonus by Race

women_race_cb <- df_2025 %>%
  mutate(commission_bonus = (replace_na(commission, 0) + replace_na(bonus, 0))) %>%
  filter(gender == "Women" & !is.na(ethnicity) & !is.na(commission_bonus) & commission_bonus > 0) %>%
  group_by(age, ethnicity) %>%
  summarise(mean_cb = mean(commission_bonus, na.rm = TRUE), .groups = "drop") %>%
  left_join(women_age_weights, by = "age") %>%
  filter(!is.na(weight)) %>%
  group_by(ethnicity) %>%
  summarise(
    weighted_mean = sum(mean_cb * weight, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(gender = "Women")

men_race_cb <- df_2025 %>%
  mutate(commission_bonus = (replace_na(commission, 0) + replace_na(bonus, 0))) %>%
  filter(gender == "Men" & !is.na(ethnicity) & !is.na(commission_bonus) & commission_bonus > 0) %>%
  group_by(ethnicity) %>%
  summarise(weighted_mean = mean(commission_bonus, na.rm = TRUE), .groups = "drop") %>%
  mutate(gender = "Men")

all_race_cb <- bind_rows(women_race_cb, men_race_cb) %>%
  pivot_wider(names_from = gender, values_from = weighted_mean, values_fill = NA) %>%
  arrange(desc(Men))

overall_women_cb <- women_race_cb %>% summarise(avg = mean(weighted_mean)) %>% pull(avg)
overall_men_cb <- men_race_cb %>% summarise(avg = mean(weighted_mean)) %>% pull(avg)

all_cb_row <- data.frame(
  ethnicity = "ALL",
  Women = overall_women_cb,
  Men = overall_men_cb
)

final_race_cb_table <- bind_rows(all_race_cb, all_cb_row)

kable(final_race_cb_table,
      caption = "Mean Commission and Bonuses by Race and Gender",
      col.names = c("Race/Ethnicity", "Women", "Men"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(nrow(final_race_cb_table), bold = TRUE, background = "#e74c3c", color = "white")
Mean Commission and Bonuses by Race and Gender
Race/Ethnicity Women Men
Other/Mixed 31,986.82 154,000.00
Hispanic/Latinx 57,995.87 109,548.18
White 55,185.12 88,577.72
Black 43,298.58 40,173.29
Asian 25,470.11 21,555.56
ALL 42,787.30 82,770.95

3.3 Compensation by Specialization and Race

spec_race_data <- df_2025 %>%
  filter(!is.na(ethnicity) & !is.na(specialization_group) & 
         !is.na(total_compensation) & total_compensation > 0 &
         specialization_group != "Other") %>%
  group_by(specialization_group, ethnicity) %>%
  summarise(
    avg_salary = mean(total_compensation, na.rm = TRUE),
    .groups = "drop"
  )

spec_averages <- df_2025 %>%
  filter(!is.na(specialization_group) & !is.na(total_compensation) & 
         total_compensation > 0 & specialization_group != "Other") %>%
  group_by(specialization_group) %>%
  summarise(overall_avg = mean(total_compensation, na.rm = TRUE), .groups = "drop")

spec_race_analysis <- spec_race_data %>%
  left_join(spec_averages, by = "specialization_group") %>%
  mutate(
    difference = avg_salary - overall_avg,
    difference_formatted = ifelse(difference >= 0, 
                                 paste0("+$", format(round(difference), big.mark = ",")),
                                 paste0("-$", format(abs(round(difference)), big.mark = ",")))
  ) %>%
  select(specialization_group, ethnicity, avg_salary, difference_formatted)

unique_specs <- unique(spec_race_analysis$specialization_group)

if(length(unique_specs) > 0) {
  for(spec in unique_specs) {
    spec_table <- spec_race_analysis %>%
      filter(specialization_group == spec) %>%
      select(ethnicity, avg_salary, difference_formatted) %>%
      arrange(desc(avg_salary))
    
    if(nrow(spec_table) > 0) {
      cat("\n")
      cat("###", toupper(spec), "SPECIALIZATION\n\n")
      
      table_output <- kable(spec_table,
                col.names = c("Race/Ethnicity", "Average", "Difference relative to average"),
                format.args = list(big.mark = ","),
                caption = paste(spec, "Compensation by Race/Ethnicity")) %>%
        kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
      
      cat(table_output)
      cat("\n\n")
    }
  }
} else {
  cat("Insufficient data for specialization by race analysis\n")
}

3.3.1 ASSET MANAGEMENT SPECIALIZATION

Asset Management Compensation by Race/Ethnicity
Race/Ethnicity Average Difference relative to average
Asian 250,578.0 +$ 67,158
White 169,544.2 -$ 13,876
Hispanic/Latinx 160,521.4 -$ 22,899
Other/Mixed 143,257.8 -$ 40,162
Black 133,990.1 -$ 49,430

3.3.2 BROKERAGE SPECIALIZATION

Brokerage Compensation by Race/Ethnicity
Race/Ethnicity Average Difference relative to average
White 224,421.4 +$ 1,641
Black 153,500.0 -$ 69,280
Hispanic/Latinx 152,416.7 -$ 70,364
Asian 132,000.0 -$ 90,780
Other/Mixed 110,000.0 -$112,780

3.3.3 DEVELOPMENT SPECIALIZATION

Development Compensation by Race/Ethnicity
Race/Ethnicity Average Difference relative to average
Hispanic/Latinx 174,942.0 -$ 32,985
White 173,475.5 -$ 34,451
Other/Mixed 164,706.0 -$ 43,221
Asian 156,321.7 -$ 51,605
Black 131,727.4 -$ 76,199

3.3.4 FINANCE SPECIALIZATION

Finance Compensation by Race/Ethnicity
Race/Ethnicity Average Difference relative to average
Hispanic/Latinx 225,270.9 +$ 8,165
White 221,546.2 +$ 4,441
Asian 179,004.6 -$ 38,101
Other/Mixed 170,538.3 -$ 46,567
Black 150,328.2 -$ 66,777

4 Career Dynamics and Preferences

4.1 Compensation Changes and Projections

comp_change_data <- apply_women_weighting(df_2025, "m22A")

comp_change_filtered <- comp_change_data %>%
  filter(!is.na(response)) %>%
  mutate(
    response = case_when(
      response == "Increase:" ~ "Increased",
      response == "Decrease:" ~ "Decreased", 
      response == "Stayed the same" ~ "Stayed the Same",
      TRUE ~ response
    )
  )

ggplot(comp_change_filtered, aes(x = response, y = weighted_percent, fill = gender)) +
  geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
  geom_text(aes(label = paste0(round(weighted_percent, 1), "%")), 
            position = position_dodge(width = 0.7), vjust = -0.5, size = 4, fontface = "bold") +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(labels = function(x) paste0(x, "%"), expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Compensation Changes from 2023-2024 by Gender",
    subtitle = "Women's responses weighted by 2020 age distribution",
    x = "Compensation Change Direction",
    y = "Percentage of Respondents",
    fill = "Gender",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation

proj_change_data <- apply_women_weighting(df_2025, "m22C")

proj_change_filtered <- proj_change_data %>%
  filter(!is.na(response)) %>%
  mutate(
    response = case_when(
      response == "Increase:" ~ "Expect Increase",
      response == "Decrease:" ~ "Expect Decrease", 
      response == "Stay the same" ~ "Expect No Change",
      TRUE ~ response
    )
  )

ggplot(proj_change_filtered, aes(x = response, y = weighted_percent, fill = gender)) +
  geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
  geom_text(aes(label = paste0(round(weighted_percent, 1), "%")), 
            position = position_dodge(width = 0.7), vjust = -0.5, size = 4, fontface = "bold") +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(labels = function(x) paste0(x, "%"), expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Projected 2025 Compensation Changes by Gender",
    subtitle = "Women's responses weighted by 2020 age distribution",
    x = "Expected Compensation Change",
    y = "Percentage of Respondents",
    fill = "Gender",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation

4.2 Commission-Based Career Preferences

commission_data <- apply_women_weighting(df_2025, "m23B")

commission_filtered <- commission_data %>%
  filter(!is.na(response))

ggplot(commission_filtered, aes(x = response, y = weighted_percent, fill = gender)) +
  geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
  geom_text(aes(label = paste0(round(weighted_percent, 1), "%")), 
            position = position_dodge(width = 0.7), vjust = -0.5, size = 4, fontface = "bold") +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(labels = function(x) paste0(x, "%"), expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Willingness to Accept Commission-Based Positions by Gender",
    subtitle = "Women's responses weighted by 2020 age distribution",
    x = "Commission Willingness",
    y = "Percentage of Respondents",
    fill = "Gender",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation

comp_structure_data <- apply_women_weighting(df_2025, "m24")

comp_structure_filtered <- comp_structure_data %>%
  filter(!is.na(response)) %>%
  mutate(
    response = case_when(
      str_detect(response, "not part of the compensation structure") ~ "Commissions Not\nPart of Career",
      str_detect(response, "actively pursued") ~ "Actively Pursued\nCommission Career",
      str_detect(response, "does not play a significant role") ~ "Commission Role\nNot Significant",
      str_detect(response, "altered.*to avoid") ~ "Altered Career to\nAvoid Commission",
      response == "None of these" ~ "None of These",
      TRUE ~ str_wrap(response, 20)
    )
  )

ggplot(comp_structure_filtered, aes(x = reorder(response, weighted_percent), y = weighted_percent, fill = gender)) +
  geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
  geom_text(aes(label = paste0(round(weighted_percent, 1), "%")), 
            position = position_dodge(width = 0.7), hjust = -0.1, size = 3.5) +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(labels = function(x) paste0(x, "%"), expand = expansion(mult = c(0, 0.15))) +
  coord_flip() +
  labs(
    title = "Role of Compensation Structure in Career Path by Gender",
    subtitle = "Women's responses weighted by 2020 age distribution",
    x = "Career Path Relationship to Commission",
    y = "Percentage of Respondents",
    fill = "Gender",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation +
  theme(axis.text.y = element_text(size = 9))

5 Work-Life Balance and Career Satisfaction

5.1 Priority and Satisfaction Metrics

df_2025_clean_scales <- df_2025 %>%
  mutate(
    family_time_importance = clean_numeric_column(`m14A[SQ001]`),
    career_satisfaction = clean_numeric_column(`m14B[SQ007]`),
    earnings_importance = clean_numeric_column(`m14A[SQ009]`)
  )

family_time_data <- apply_compensation_weighting(df_2025_clean_scales, "family_time_importance")

ggplot(family_time_data, aes(x = gender, y = mean_salary, fill = gender)) +
  geom_col(width = 0.6, alpha = 0.8) +
  geom_text(aes(label = round(mean_salary, 1)), 
            vjust = -0.5, size = 5, fontface = "bold") +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Importance of Family Time by Gender",
    subtitle = "Scale: 1 (Not Important) to 10 (Very Important)",
    x = "Gender",
    y = "Mean Importance Score",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation +
  guides(fill = "none")

career_sat_data <- apply_compensation_weighting(df_2025_clean_scales, "career_satisfaction")

ggplot(career_sat_data, aes(x = gender, y = mean_salary, fill = gender)) +
  geom_col(width = 0.6, alpha = 0.8) +
  geom_text(aes(label = round(mean_salary, 1)), 
            vjust = -0.5, size = 5, fontface = "bold") +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Career Achievement Satisfaction by Gender",
    subtitle = "Scale: 1 (Not Satisfied) to 10 (Very Satisfied)",
    x = "Gender",
    y = "Mean Satisfaction Score",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation +
  guides(fill = "none")

earnings_imp_data <- apply_compensation_weighting(df_2025_clean_scales, "earnings_importance")

ggplot(earnings_imp_data, aes(x = gender, y = mean_salary, fill = gender)) +
  geom_col(width = 0.6, alpha = 0.8) +
  geom_text(aes(label = round(mean_salary, 1)), 
            vjust = -0.5, size = 5, fontface = "bold") +
  scale_fill_manual(values = gender_colors) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Importance of Maximizing Earnings Potential by Gender",
    subtitle = "Scale: 1 (Not Important) to 10 (Very Important)",
    x = "Gender",
    y = "Mean Importance Score",
    caption = "Source: 2025 Survey Data"
  ) +
  theme_compensation +
  guides(fill = "none")

6 Executive Summary

6.1 Key Gender Pay Gap Findings

base_salary_gap <- calculate_pay_gap(apply_compensation_weighting(df_2025, "base_salary"))
commission_gap <- calculate_pay_gap(apply_compensation_weighting(df_2025, "commission"))
bonus_gap <- calculate_pay_gap(apply_compensation_weighting(df_2025, "bonus"))
total_comp_gap <- calculate_pay_gap(apply_compensation_weighting(df_2025, "total_compensation"))

summary_gaps <- data.frame(
  Component = c("Base Salary", "Commission", "Bonus", "Total Compensation"),
  `Gender_Gap_Percent` = c(base_salary_gap, commission_gap, bonus_gap, total_comp_gap)
)

kable(summary_gaps,
      caption = "Gender Pay Gaps Summary - Key Compensation Components",
      col.names = c("Compensation Component", "Gender Gap (%)"),
      digits = 1) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#e74c3c")
Gender Pay Gaps Summary - Key Compensation Components
Compensation Component Gender Gap (%)
Base Salary 19.9
Commission 29.0
Bonus 61.2
Total Compensation 36.0

6.2 Compensation Source Distribution

compensation_sources_2025 <- df_2025 %>%
  filter(!is.na(total_compensation) & total_compensation > 0 &
         !is.na(base_salary) & !is.na(bonus) & !is.na(commission) & 
         !is.na(profit_sharing) & !is.na(long_term_incentive)) %>%
  mutate(
    component_sum = base_salary + bonus + commission + profit_sharing + long_term_incentive,
    base_pct = (base_salary / component_sum) * 100,
    bonus_pct = (bonus / component_sum) * 100,
    commission_pct = (commission / component_sum) * 100,
    profit_pct = (profit_sharing / component_sum) * 100,
    lti_pct = (long_term_incentive / component_sum) * 100
  ) %>%
  filter(component_sum > 0) %>%
  group_by(gender) %>%
  summarise(
    Annual_Salary = round(mean(base_pct, na.rm = TRUE)),
    Short_Term_Bonus = round(mean(bonus_pct, na.rm = TRUE)),
    Commission = round(mean(commission_pct, na.rm = TRUE)),
    Profit_Sharing = round(mean(profit_pct, na.rm = TRUE)),
    Long_Term_Incentive = round(mean(lti_pct, na.rm = TRUE)),
    .groups = "drop"
  )

if(nrow(compensation_sources_2025) > 0) {
  sources_table <- compensation_sources_2025 %>%
    pivot_longer(cols = -gender, names_to = "Component", values_to = "Percentage") %>%
    pivot_wider(names_from = gender, values_from = Percentage) %>%
    mutate(
      Component = case_when(
        Component == "Annual_Salary" ~ "Annual Salary",
        Component == "Short_Term_Bonus" ~ "Short-Term Incentive Bonus",
        Component == "Commission" ~ "Commission",
        Component == "Profit_Sharing" ~ "Profit Sharing", 
        Component == "Long_Term_Incentive" ~ "Long-Term Incentive",
        TRUE ~ Component
      )
    )
  
  if(!"Women" %in% names(sources_table)) sources_table$Women <- NA
  if(!"Men" %in% names(sources_table)) sources_table$Men <- NA
  
  sources_table <- sources_table %>%
    select(Component, Women, Men)
  
  kable(sources_table,
        caption = "Sources of Compensation by Component (Percentage of Total Compensation)",
        col.names = c("Compensation Component", "Women (%)", "Men (%)"),
        digits = 0) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
}
Sources of Compensation by Component (Percentage of Total Compensation)
Compensation Component Women (%) Men (%)
Annual Salary 76 66
Short-Term Incentive Bonus 9 8
Commission 9 19
Profit Sharing 5 5
Long-Term Incentive 2 2